from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
def zip_to_df(link):
    """
    Download an ATUS special-request zip archive and parse its .dat member
    into a pandas DataFrame of strings.

    The member name is derived from the zip filename by replacing the hyphen
    with an underscore (e.g. atusact-0320.zip -> atusact_0320.dat). The first
    record of the file supplies the column names; all cells stay as str so
    downstream string comparisons (e.g. day != '1') keep working.

    Args:
        link: str — URL of the BLS ATUS zip file.
    Returns:
        pandas.DataFrame with string cells; header row removed from the body.
    """
    # atusact-0320.zip -> atusact_0320.dat
    dat_name = "_".join(link.split('/')[-1].split('.')[0].split('-')) + '.dat'
    with urlopen(link) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            raw = zfile.read(dat_name).decode("utf-8")
    # Split on CRLF and skip empty lines: the file ends with a line
    # terminator, and keeping the trailing '' would append a junk row
    # of empty/None cells to the frame.
    txt_lst = [line.split(',') for line in raw.split('\r\n') if line]
    df1 = pd.DataFrame(txt_lst)
    df1.columns = df1.iloc[0]  # first record holds the header names
    df1 = df1[1:]
    return df1
import pandas as pd
df = zip_to_df('https://www.bls.gov/tus/special.requests/atusact-0320.zip')
df.head()
| TUCASEID | TUACTIVITY_N | TUACTDUR24 | TUCC5 | TUCC5B | TRTCCTOT_LN | TRTCC_LN | TRTCOC_LN | TUSTARTTIM | TUSTOPTIME | ... | TRTONHH_LN | TRTOHH_LN | TRTHH_LN | TRTNOHH_LN | TEWHERE | TUCC7 | TRWBELIG | TRTEC_LN | TUEC24 | TUDURSTOP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | 60 | -1 | -1 | -1 | -1 | -1 | 04:00:00 | 05:00:00 | ... | -1 | -1 | -1 | -1 | 9 | -1 | -1 | -1 | -1 | -1 |
| 2 | 20030100013280 | 2 | 30 | -1 | -1 | -1 | -1 | -1 | 05:00:00 | 05:30:00 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 3 | 20030100013280 | 3 | 600 | -1 | -1 | -1 | -1 | -1 | 05:30:00 | 15:30:00 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
| 4 | 20030100013280 | 4 | 150 | -1 | -1 | -1 | -1 | -1 | 15:30:00 | 18:00:00 | ... | -1 | -1 | -1 | -1 | 1 | -1 | -1 | -1 | -1 | -1 |
| 5 | 20030100013280 | 5 | 5 | -1 | -1 | -1 | -1 | -1 | 18:00:00 | 18:05:00 | ... | -1 | -1 | -1 | -1 | 1 | -1 | -1 | -1 | -1 | -1 |
5 rows × 29 columns
df.shape
(4276393, 29)
df_ec = zip_to_df('https://www.bls.gov/tus/special.requests/atusrostec-1120.zip')
df_ec.head()
| TUCASEID | TEAGE_EC | TEELDUR | TEELWHO | TEELYRS | TRELHH | TUECLNO | TULINENO | |
|---|---|---|---|---|---|---|---|---|
| 1 | 20110101110074 | 70 | 4 | 44 | 2 | 0 | 5 | -1 |
| 2 | 20110101110156 | 85 | 4 | 46 | 2 | 0 | 5 | -1 |
| 3 | 20110101110507 | 80 | 1 | 55 | -1 | 0 | 2 | -1 |
| 4 | 20110101110521 | 85 | 3 | 43 | -1 | 0 | 3 | -1 |
| 5 | 20110101110522 | 80 | 4 | 44 | 6 | 0 | 2 | -1 |
df_ec.shape
(26436, 8)
elec = df[df['TUCASEID'].isin(df_ec['TUCASEID'])]
elec.head()
| TUCASEID | TUACTIVITY_N | TUACTDUR24 | TUCC5 | TUCC5B | TRTCCTOT_LN | TRTCC_LN | TRTCOC_LN | TUSTARTTIM | TUSTOPTIME | ... | TRTONHH_LN | TRTOHH_LN | TRTHH_LN | TRTNOHH_LN | TEWHERE | TUCC7 | TRWBELIG | TRTEC_LN | TUEC24 | TUDURSTOP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2228592 | 20110101110074 | 1 | 10 | 0 | 0 | 0 | -1 | 0 | 04:00:00 | 04:10:00 | ... | -1 | -1 | -1 | -1 | 1 | 0 | -1 | -1 | -1 | -1 |
| 2228593 | 20110101110074 | 2 | 300 | 0 | 0 | 0 | -1 | 0 | 04:10:00 | 09:10:00 | ... | -1 | -1 | -1 | -1 | 1 | 0 | -1 | -1 | -1 | -1 |
| 2228594 | 20110101110074 | 3 | 5 | 0 | 0 | 0 | -1 | 0 | 09:10:00 | 09:15:00 | ... | -1 | -1 | -1 | -1 | 13 | 0 | -1 | -1 | -1 | -1 |
| 2228595 | 20110101110074 | 4 | 525 | 0 | 0 | 0 | -1 | 0 | 09:15:00 | 18:00:00 | ... | -1 | -1 | -1 | -1 | 3 | 0 | -1 | -1 | -1 | -1 |
| 2228596 | 20110101110074 | 5 | 5 | 0 | 0 | 0 | -1 | 0 | 18:00:00 | 18:05:00 | ... | -1 | -1 | -1 | -1 | 13 | 0 | -1 | -1 | -1 | -1 |
5 rows × 29 columns
elec.shape
(395956, 29)
not_elec = df[~df['TUCASEID'].isin(df_ec['TUCASEID'])]
not_elec.shape
(3880437, 29)
resp = zip_to_df('https://www.bls.gov/tus/special.requests/atusresp-0320.zip')
resp.head()
| TUCASEID | TULINENO | TESPUHRS | TRDTIND1 | TRDTOCC1 | TRERNHLY | TRERNUPD | TRHERNAL | TRHHCHILD | TRIMIND1 | ... | TRWBMODR | TRTALONE_WK | TRTCCC_WK | TRLVMODR | TRTEC | TUECYTD | TUELDER | TUELFREQ | TUELNUM | TU20FWGT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | -1 | 40 | 8 | 2200 | 1 | 1 | 2 | 15 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1.000000 |
| 2 | 20030100013344 | 1 | 50 | 16 | 16 | -1 | 1 | -1 | 1 | 5 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1.000000 |
| 3 | 20030100013352 | 1 | -1 | 43 | 15 | 1250 | 0 | 0 | 2 | 16 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1.000000 |
| 4 | 20030100013848 | 1 | 40 | -1 | -1 | -1 | -1 | -1 | 1 | -1 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1.000000 |
| 5 | 20030100014165 | 1 | -1 | 42 | 10 | -1 | -1 | -1 | 1 | 16 | ... | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1.000000 |
5 rows × 133 columns
## tufnwgtp "ATUS final weight"
resp['tufnwgtp'.upper()].value_counts()
-1.000000 8782
1949516.757600 3
2813614.979566 3
3204883.318559 3
2008843.914803 3
...
4163618.367498 1
4176871.081592 1
10545923.331287 1
22580032.442562 1
4376875.228710 1
Name: TUFNWGTP, Length: 210064, dtype: int64
resp.groupby('tuyear'.upper()).count()
| TUCASEID | TULINENO | TESPUHRS | TRDTIND1 | TRDTOCC1 | TRERNHLY | TRERNUPD | TRHERNAL | TRHHCHILD | TRIMIND1 | ... | TRWBMODR | TRTALONE_WK | TRTCCC_WK | TRLVMODR | TRTEC | TUECYTD | TUELDER | TUELFREQ | TUELNUM | TU20FWGT | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| TUYEAR | |||||||||||||||||||||
| 2003 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | ... | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 | 20720 |
| 2004 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | ... | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 | 13973 |
| 2005 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | ... | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 | 13038 |
| 2006 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | ... | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 | 12943 |
| 2007 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | ... | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 | 12248 |
| 2008 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | ... | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 | 12723 |
| 2009 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | ... | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 | 13133 |
| 2010 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | ... | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 | 13260 |
| 2011 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | ... | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 | 12479 |
| 2012 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | ... | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 | 12443 |
| 2013 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | ... | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 | 11385 |
| 2014 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | ... | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 | 11592 |
| 2015 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | ... | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 | 10905 |
| 2016 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | ... | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 | 10493 |
| 2017 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | ... | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 | 10223 |
| 2018 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | ... | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 | 9593 |
| 2019 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | ... | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 | 9435 |
| 2020 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | ... | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 | 8782 |
18 rows × 132 columns
resp.loc[resp['tu20fwgt'.upper()]!='-1.000000', ['tu20fwgt'.upper()]].astype(float).describe()
| TU20FWGT | |
|---|---|
| count | 1.821700e+04 |
| mean | 9.057423e+06 |
| std | 9.002650e+06 |
| min | 0.000000e+00 |
| 25% | 3.668674e+06 |
| 50% | 6.634049e+06 |
| 75% | 1.151549e+07 |
| max | 1.371517e+08 |
resp.loc[resp['tufnwgtp'.upper()]!='-1.000000', 'tufnwgtp'.upper()].astype(float).describe()
count 2.105860e+05 mean 7.219147e+06 std 6.872690e+06 min 4.194717e+05 25% 2.949501e+06 50% 5.248605e+06 75% 8.856077e+06 max 2.090100e+08 Name: TUFNWGTP, dtype: float64
## Because 2020 data introduces a lot of noise into the data (due to collection issues and the pandemic), it is excluded from the following analysis
weights = resp.loc[resp['tufnwgtp'.upper()]!='-1.000000', ['TUCASEID', 'tufnwgtp'.upper()]]
weights['TUFNWGTP'] = weights['TUFNWGTP'].astype(float)
weights.shape
(210587, 2)
weights['TUFNWGTP'].describe()
count 2.105860e+05 mean 7.219147e+06 std 6.872690e+06 min 4.194717e+05 25% 2.949501e+06 50% 5.248605e+06 75% 8.856077e+06 max 2.090100e+08 Name: TUFNWGTP, dtype: float64
weights['TUFNWGTP'].sum()
1520251316407.2427
weights['adj_weights'] = weights['TUFNWGTP']*210587/1520251316407.2427
weights['adj_weights'].describe()
count 210586.000000 mean 1.000005 std 0.952013 min 0.058106 25% 0.408568 50% 0.727043 75% 1.226754 max 28.952315 Name: adj_weights, dtype: float64
weights.columns = ['caseid', 'r_weights', 'adj_weights']
## I normalize the 2003-2019 weights so that the full sample adds up to 1; this means that the weights
## for the subsample of elder caregivers do not necessarily sum to 1.
## demog variables (age) are in the atussum file but not in others open right now
atussum = zip_to_df('https://www.bls.gov/tus/special.requests/atussum-0320.zip')
atussum.head()
| TUCASEID | GEMETSTA | GTMETSTA | PEEDUCA | PEHSPNON | PTDTRACE | TEAGE | TELFS | TEMJOT | TESCHENR | ... | t181801 | t181899 | t189999 | t500101 | t500103 | t500104 | t500105 | t500106 | t500107 | t509989 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | -1 | 44 | 2 | 2 | 60 | 2 | 2 | -1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 20030100013344 | 2 | -1 | 40 | 2 | 1 | 41 | 1 | 2 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 20030100013352 | 1 | -1 | 41 | 2 | 1 | 26 | 2 | 2 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 20030100013848 | 2 | -1 | 39 | 2 | 2 | 36 | 4 | -1 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 20030100014165 | 2 | -1 | 45 | 2 | 1 | 51 | 1 | 2 | -1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 456 columns
demog = atussum[['TUCASEID', 'TUDIARYDAY', 'TEAGE']]
demog.columns = ['caseid', 'day', 'age']
demog.head()
| caseid | day | age | |
|---|---|---|---|
| 1 | 20030100013280 | 6 | 60 |
| 2 | 20030100013344 | 7 | 41 |
| 3 | 20030100013352 | 7 | 26 |
| 4 | 20030100013848 | 5 | 36 |
| 5 | 20030100014165 | 5 | 51 |
demog.age.value_counts()
80 6792
39 4526
37 4491
43 4476
40 4455
...
19 1725
20 1674
21 1653
78 1639
79 1501
Name: age, Length: 67, dtype: int64
demog_filter = demog.dropna()
demog_filter = demog_filter[((demog_filter['day']!='1') & (demog_filter['day']!='7'))]
demog_filter.shape
(109287, 3)
demog_filter = demog_filter[(demog_filter['age'].astype(int)>=20) & (demog_filter['age'].astype(int)<65)]
demog_filter.shape
(80516, 3)
elec = elec[elec['TUCASEID'].isin(demog_filter['caseid'])]
elec.shape
(148798, 29)
not_elec = not_elec[not_elec['TUCASEID'].isin(demog_filter['caseid'])]
not_elec.shape
(1528735, 29)
elec.columns
Index(['TUCASEID', 'TUACTIVITY_N', 'TUACTDUR24', 'TUCC5', 'TUCC5B',
'TRTCCTOT_LN', 'TRTCC_LN', 'TRTCOC_LN', 'TUSTARTTIM', 'TUSTOPTIME',
'TRCODEP', 'TRTIER1P', 'TRTIER2P', 'TUCC8', 'TUCUMDUR', 'TUCUMDUR24',
'TUACTDUR', 'TR_03CC57', 'TRTO_LN', 'TRTONHH_LN', 'TRTOHH_LN',
'TRTHH_LN', 'TRTNOHH_LN', 'TEWHERE', 'TUCC7', 'TRWBELIG', 'TRTEC_LN',
'TUEC24', 'TUDURSTOP'],
dtype='object', name=0)
elec.head()
| TUCASEID | TUACTIVITY_N | TUACTDUR24 | TUCC5 | TUCC5B | TRTCCTOT_LN | TRTCC_LN | TRTCOC_LN | TUSTARTTIM | TUSTOPTIME | ... | TRTONHH_LN | TRTOHH_LN | TRTHH_LN | TRTNOHH_LN | TEWHERE | TUCC7 | TRWBELIG | TRTEC_LN | TUEC24 | TUDURSTOP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2228592 | 20110101110074 | 1 | 10 | 0 | 0 | 0 | -1 | 0 | 04:00:00 | 04:10:00 | ... | -1 | -1 | -1 | -1 | 1 | 0 | -1 | -1 | -1 | -1 |
| 2228593 | 20110101110074 | 2 | 300 | 0 | 0 | 0 | -1 | 0 | 04:10:00 | 09:10:00 | ... | -1 | -1 | -1 | -1 | 1 | 0 | -1 | -1 | -1 | -1 |
| 2228594 | 20110101110074 | 3 | 5 | 0 | 0 | 0 | -1 | 0 | 09:10:00 | 09:15:00 | ... | -1 | -1 | -1 | -1 | 13 | 0 | -1 | -1 | -1 | -1 |
| 2228595 | 20110101110074 | 4 | 525 | 0 | 0 | 0 | -1 | 0 | 09:15:00 | 18:00:00 | ... | -1 | -1 | -1 | -1 | 3 | 0 | -1 | -1 | -1 | -1 |
| 2228596 | 20110101110074 | 5 | 5 | 0 | 0 | 0 | -1 | 0 | 18:00:00 | 18:05:00 | ... | -1 | -1 | -1 | -1 | 13 | 0 | -1 | -1 | -1 | -1 |
5 rows × 29 columns
diary = elec[['TUCASEID', 'TUACTIVITY_N', 'TRCODEP', 'TUACTDUR24', 'TUSTARTTIM', 'TUSTOPTIME']].dropna()
diary.columns = ['caseid', 'actline', 'activity', 'duration', 'start', 'stop']
diary.head()
| caseid | actline | activity | duration | start | stop | |
|---|---|---|---|---|---|---|
| 2228592 | 20110101110074 | 1 | 110101 | 10 | 04:00:00 | 04:10:00 |
| 2228593 | 20110101110074 | 2 | 020402 | 300 | 04:10:00 | 09:10:00 |
| 2228594 | 20110101110074 | 3 | 180482 | 5 | 09:10:00 | 09:15:00 |
| 2228595 | 20110101110074 | 4 | 500101 | 525 | 09:15:00 | 18:00:00 |
| 2228596 | 20110101110074 | 5 | 180482 | 5 | 18:00:00 | 18:05:00 |
not_diary = not_elec[['TUCASEID', 'TUACTIVITY_N', 'TRCODEP', 'TUACTDUR24', 'TUSTARTTIM', 'TUSTOPTIME']].dropna()
not_diary.columns = ['caseid', 'actline', 'activity', 'duration', 'start', 'stop']
not_diary.head()
| caseid | actline | activity | duration | start | stop | |
|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | 130124 | 60 | 04:00:00 | 05:00:00 |
| 2 | 20030100013280 | 2 | 010201 | 30 | 05:00:00 | 05:30:00 |
| 3 | 20030100013280 | 3 | 010101 | 600 | 05:30:00 | 15:30:00 |
| 4 | 20030100013280 | 4 | 120303 | 150 | 15:30:00 | 18:00:00 |
| 5 | 20030100013280 | 5 | 110101 | 5 | 18:00:00 | 18:05:00 |
import activityDictionary
from importlib import reload
activityDictionary = reload(activityDictionary)
activityDictionary.activityDictionary("10499")
2
from timeStampDictionary import numberToOneFourFourZeroScale, timestampToNumber, clocktimeToNumber, numberToClocktime
import timeStampDictionary
timeStampDictionary = reload(timeStampDictionary)
timeStampDictionary.numberToClocktime(2)
'04:01:00'
numberToOneFourFourZeroScale(1)
255
timestampToNumber(15960)
27
clocktimeToNumber('13:31:00')
572
diary['lst_act'] = diary['activity'].astype(int).astype(str).apply(activityDictionary.activityDictionary)
diary.head()
| caseid | actline | activity | duration | start | stop | lst_act | |
|---|---|---|---|---|---|---|---|
| 2228592 | 20110101110074 | 1 | 110101 | 10 | 04:00:00 | 04:10:00 | 9 |
| 2228593 | 20110101110074 | 2 | 020402 | 300 | 04:10:00 | 09:10:00 | 3 |
| 2228594 | 20110101110074 | 3 | 180482 | 5 | 09:10:00 | 09:15:00 | 11 |
| 2228595 | 20110101110074 | 4 | 500101 | 525 | 09:15:00 | 18:00:00 | 11 |
| 2228596 | 20110101110074 | 5 | 180482 | 5 | 18:00:00 | 18:05:00 | 11 |
not_diary['lst_act'] = not_diary['activity'].astype(int).astype(str).apply(activityDictionary.activityDictionary)
not_diary.head()
| caseid | actline | activity | duration | start | stop | lst_act | |
|---|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | 130124 | 60 | 04:00:00 | 05:00:00 | 10 |
| 2 | 20030100013280 | 2 | 010201 | 30 | 05:00:00 | 05:30:00 | 2 |
| 3 | 20030100013280 | 3 | 010101 | 600 | 05:30:00 | 15:30:00 | 1 |
| 4 | 20030100013280 | 4 | 120303 | 150 | 15:30:00 | 18:00:00 | 8 |
| 5 | 20030100013280 | 5 | 110101 | 5 | 18:00:00 | 18:05:00 | 9 |
# Collapse detailed activity categories 1, 2, 7, 8, 9 into category 10 so the
# tempogram uses a reduced taxonomy (category 10 absorbs the others).
diary = diary.replace({'lst_act': {2: 10, 7: 10, 8: 10, 9: 10, 1: 10}})
not_diary = not_diary.replace({'lst_act': {2: 10, 7: 10, 8: 10, 9: 10, 1: 10}})
from itertools import repeat  # NOTE(review): imported but never used below — remove?
# Wrap each activity code in a singleton list; multiplied by the spell's
# duration later to expand it into one entry per minute.
diary['seq'] = diary['lst_act'].apply(lambda x: [x])
diary.head()
| caseid | actline | activity | duration | start | stop | lst_act | seq | |
|---|---|---|---|---|---|---|---|---|
| 2228592 | 20110101110074 | 1 | 110101 | 10 | 04:00:00 | 04:10:00 | 10 | [10] |
| 2228593 | 20110101110074 | 2 | 020402 | 300 | 04:10:00 | 09:10:00 | 3 | [3] |
| 2228594 | 20110101110074 | 3 | 180482 | 5 | 09:10:00 | 09:15:00 | 11 | [11] |
| 2228595 | 20110101110074 | 4 | 500101 | 525 | 09:15:00 | 18:00:00 | 11 | [11] |
| 2228596 | 20110101110074 | 5 | 180482 | 5 | 18:00:00 | 18:05:00 | 11 | [11] |
not_diary['seq'] = not_diary['lst_act'].apply(lambda x: [x])
not_diary.head()
| caseid | actline | activity | duration | start | stop | lst_act | seq | |
|---|---|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | 130124 | 60 | 04:00:00 | 05:00:00 | 10 | [10] |
| 2 | 20030100013280 | 2 | 010201 | 30 | 05:00:00 | 05:30:00 | 10 | [10] |
| 3 | 20030100013280 | 3 | 010101 | 600 | 05:30:00 | 15:30:00 | 10 | [10] |
| 4 | 20030100013280 | 4 | 120303 | 150 | 15:30:00 | 18:00:00 | 10 | [10] |
| 5 | 20030100013280 | 5 | 110101 | 5 | 18:00:00 | 18:05:00 | 10 | [10] |
diary['seq'] = diary['seq']*diary['duration'].astype(int)
diary.head()
| caseid | actline | activity | duration | start | stop | lst_act | seq | |
|---|---|---|---|---|---|---|---|---|
| 2228592 | 20110101110074 | 1 | 110101 | 10 | 04:00:00 | 04:10:00 | 10 | [10, 10, 10, 10, 10, 10, 10, 10, 10, 10] |
| 2228593 | 20110101110074 | 2 | 020402 | 300 | 04:10:00 | 09:10:00 | 3 | [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ... |
| 2228594 | 20110101110074 | 3 | 180482 | 5 | 09:10:00 | 09:15:00 | 11 | [11, 11, 11, 11, 11] |
| 2228595 | 20110101110074 | 4 | 500101 | 525 | 09:15:00 | 18:00:00 | 11 | [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1... |
| 2228596 | 20110101110074 | 5 | 180482 | 5 | 18:00:00 | 18:05:00 | 11 | [11, 11, 11, 11, 11] |
not_diary['seq'] = not_diary['seq']*not_diary['duration'].astype(int)
not_diary.head()
| caseid | actline | activity | duration | start | stop | lst_act | seq | |
|---|---|---|---|---|---|---|---|---|
| 1 | 20030100013280 | 1 | 130124 | 60 | 04:00:00 | 05:00:00 | 10 | [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1... |
| 2 | 20030100013280 | 2 | 010201 | 30 | 05:00:00 | 05:30:00 | 10 | [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1... |
| 3 | 20030100013280 | 3 | 010101 | 600 | 05:30:00 | 15:30:00 | 10 | [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1... |
| 4 | 20030100013280 | 4 | 120303 | 150 | 15:30:00 | 18:00:00 | 10 | [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1... |
| 5 | 20030100013280 | 5 | 110101 | 5 | 18:00:00 | 18:05:00 | 10 | [10, 10, 10, 10, 10] |
# Build one 1440-minute activity sequence per respondent:
# initialise caseid -> empty list for both samples...
not_tempoTable ={}
for i in not_diary['caseid']:
not_tempoTable[i] = []
tempoTable ={}
for i in diary['caseid']:
tempoTable[i] = []
# ...then append each activity spell's per-minute expansion ('seq') onto its
# case's timeline, in diary-row order, so the lists read as minute-by-minute days.
for i in diary.index:
tempoTable[diary.loc[i, 'caseid']].extend(diary.loc[i, 'seq'])
for i in not_diary.index:
not_tempoTable[not_diary.loc[i, 'caseid']].extend(not_diary.loc[i, 'seq'])
# Rebuild diary as a wide frame: one row per caseid, 1440 minute columns (0-1439).
del diary
diary = pd.DataFrame(tempoTable).T
diary.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20110101110074 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20110101110522 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20110101110639 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20110101110961 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20110101111218 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
5 rows × 1440 columns
del not_diary
not_diary = pd.DataFrame(not_tempoTable).T
not_diary.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20030100013280 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20030100013848 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20030100014165 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20030100014169 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 20030100014209 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
5 rows × 1440 columns
diary.shape
(6758, 1440)
not_diary.shape
(73758, 1440)
diary.to_csv('diary.csv')
diary = diary.reset_index()
diary.head()
| index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20110101110074 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 1 | 20110101110522 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 2 | 20110101110639 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 3 | 20110101110961 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 4 | 20110101111218 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
5 rows × 1441 columns
not_diary = not_diary.reset_index()
not_diary.head()
| index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20030100013280 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 1 | 20030100013848 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 2 | 20030100014165 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 3 | 20030100014169 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 4 | 20030100014209 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
5 rows × 1441 columns
diary = diary.rename(columns = {"index": "caseid"})
diary.head()
| caseid | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20110101110074 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 1 | 20110101110522 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 2 | 20110101110639 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 3 | 20110101110961 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 4 | 20110101111218 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
5 rows × 1441 columns
not_diary = not_diary.rename(columns = {"index": "caseid"})
not_diary.head()
| caseid | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 1430 | 1431 | 1432 | 1433 | 1434 | 1435 | 1436 | 1437 | 1438 | 1439 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 20030100013280 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 1 | 20030100013848 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 2 | 20030100014165 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 3 | 20030100014169 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
| 4 | 20030100014209 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 | 10 |
5 rows × 1441 columns
# NOTE(review): wt_diary / not_wt_diary are built here but never used below —
# presumably intended for a weighted version of the tempogram; confirm.
wt_diary = pd.merge(diary, weights[['caseid', 'adj_weights']], how='inner', on='caseid')
not_wt_diary = pd.merge(not_diary, weights[['caseid', 'adj_weights']], how='inner', on='caseid')
# For each of the 1440 minute columns, count how many respondents are in each
# activity category; stack the per-minute value_counts vertically.
# NOTE(review): concatenating inside the loop is quadratic — appending the
# value_counts to a list and calling pd.concat once would be faster.
tud = pd.DataFrame()
for i in range(0, 1440):
tud = pd.concat([tud, diary[i].value_counts()])
not_tud = pd.DataFrame()
for i in range(0, 1440):
not_tud = pd.concat([not_tud, not_diary[i].value_counts()])
tud = tud.reset_index()
tud.head()
| index | 0 | |
|---|---|---|
| 0 | 10 | 6449.0 |
| 1 | 6 | 164.0 |
| 2 | 11 | 59.0 |
| 3 | 3 | 52.0 |
| 4 | 4 | 26.0 |
not_tud = not_tud.reset_index()
not_tud.head()
| index | 0 | |
|---|---|---|
| 0 | 10 | 70148.0 |
| 1 | 6 | 1973.0 |
| 2 | 3 | 584.0 |
| 3 | 11 | 525.0 |
| 4 | 4 | 479.0 |
tud['index'].unique()
array([10, 6, 11, 3, 4, 5])
not_tud['index'].unique()
array([10, 6, 3, 11, 4, 5])
import numpy as np
r = sorted(list(np.arange(1, 1441))*6)
tud['timeStamp'] = r
tud.head(20)
| index | 0 | timeStamp | |
|---|---|---|---|
| 0 | 10 | 6449.0 | 1 |
| 1 | 6 | 164.0 | 1 |
| 2 | 11 | 59.0 | 1 |
| 3 | 3 | 52.0 | 1 |
| 4 | 4 | 26.0 | 1 |
| 5 | 5 | 8.0 | 1 |
| 6 | 10 | 6448.0 | 2 |
| 7 | 6 | 165.0 | 2 |
| 8 | 11 | 58.0 | 2 |
| 9 | 3 | 53.0 | 2 |
| 10 | 4 | 26.0 | 2 |
| 11 | 5 | 8.0 | 2 |
| 12 | 10 | 6449.0 | 3 |
| 13 | 6 | 165.0 | 3 |
| 14 | 11 | 58.0 | 3 |
| 15 | 3 | 52.0 | 3 |
| 16 | 4 | 26.0 | 3 |
| 17 | 5 | 8.0 | 3 |
| 18 | 10 | 6449.0 | 4 |
| 19 | 6 | 166.0 | 4 |
not_tud['timeStamp'] = r
not_tud.head(20)
| index | 0 | timeStamp | |
|---|---|---|---|
| 0 | 10 | 70148.0 | 1 |
| 1 | 6 | 1973.0 | 1 |
| 2 | 3 | 584.0 | 1 |
| 3 | 11 | 525.0 | 1 |
| 4 | 4 | 479.0 | 1 |
| 5 | 5 | 49.0 | 1 |
| 6 | 10 | 70128.0 | 2 |
| 7 | 6 | 1977.0 | 2 |
| 8 | 3 | 590.0 | 2 |
| 9 | 11 | 535.0 | 2 |
| 10 | 4 | 482.0 | 2 |
| 11 | 5 | 46.0 | 2 |
| 12 | 10 | 70134.0 | 3 |
| 13 | 6 | 1981.0 | 3 |
| 14 | 3 | 580.0 | 3 |
| 15 | 11 | 535.0 | 3 |
| 16 | 4 | 483.0 | 3 |
| 17 | 5 | 45.0 | 3 |
| 18 | 10 | 70150.0 | 4 |
| 19 | 6 | 1980.0 | 4 |
tud.columns = ['index', 'count', 'timeStamp']
tud['activity'] = tud['index'].apply(activityDictionary.activityNumberToStringName)
tud.head()
| index | count | timeStamp | activity | |
|---|---|---|---|---|
| 0 | 10 | 6449.0 | 1 | Leisure |
| 1 | 6 | 164.0 | 1 | Work and Education |
| 2 | 11 | 59.0 | 1 | Travel and Other |
| 3 | 3 | 52.0 | 1 | Housework |
| 4 | 4 | 26.0 | 1 | Child Care |
not_tud.columns = ['index', 'count', 'timeStamp']
not_tud['activity'] = not_tud['index'].apply(activityDictionary.activityNumberToStringName)
not_tud.head()
| index | count | timeStamp | activity | |
|---|---|---|---|---|
| 0 | 10 | 70148.0 | 1 | Leisure |
| 1 | 6 | 1973.0 | 1 | Work and Education |
| 2 | 3 | 584.0 | 1 | Housework |
| 3 | 11 | 525.0 | 1 | Travel and Other |
| 4 | 4 | 479.0 | 1 | Child Care |
tud.loc[tud['timeStamp']==1, 'count'].sum()
6758.0
not_tud.loc[not_tud['timeStamp']==1, 'count'].sum()
73758.0
from datetime import datetime
# Convert the minute-of-day offset into a datetime for axis labelling.
# NOTE(review): datetime.fromtimestamp() interprets the value in the machine's
# local timezone — the printed 01:01:00 for minute 1 reflects a UTC+1 locale.
# An explicit tz (or utcfromtimestamp) would make the labels reproducible; confirm.
tud['time'] = [datetime.fromtimestamp(i*60) for i in tud['timeStamp']]
tud.head()
| index | count | timeStamp | activity | time | |
|---|---|---|---|---|---|
| 0 | 10 | 6449.0 | 1 | Leisure | 1970-01-01 01:01:00 |
| 1 | 6 | 164.0 | 1 | Work and Education | 1970-01-01 01:01:00 |
| 2 | 11 | 59.0 | 1 | Travel and Other | 1970-01-01 01:01:00 |
| 3 | 3 | 52.0 | 1 | Housework | 1970-01-01 01:01:00 |
| 4 | 4 | 26.0 | 1 | Child Care | 1970-01-01 01:01:00 |
not_tud['time'] = [datetime.fromtimestamp(i*60) for i in not_tud['timeStamp']]
not_tud.head()
| index | count | timeStamp | activity | time | |
|---|---|---|---|---|---|
| 0 | 10 | 70148.0 | 1 | Leisure | 1970-01-01 01:01:00 |
| 1 | 6 | 1973.0 | 1 | Work and Education | 1970-01-01 01:01:00 |
| 2 | 3 | 584.0 | 1 | Housework | 1970-01-01 01:01:00 |
| 3 | 11 | 525.0 | 1 | Travel and Other | 1970-01-01 01:01:00 |
| 4 | 4 | 479.0 | 1 | Child Care | 1970-01-01 01:01:00 |
tud['perc'] = round(tud['count']*100/tud.loc[tud['timeStamp']==1, 'count'].sum(), 2)
tud.head()
| index | count | timeStamp | activity | time | perc | |
|---|---|---|---|---|---|---|
| 0 | 10 | 6449.0 | 1 | Leisure | 1970-01-01 01:01:00 | 95.43 |
| 1 | 6 | 164.0 | 1 | Work and Education | 1970-01-01 01:01:00 | 2.43 |
| 2 | 11 | 59.0 | 1 | Travel and Other | 1970-01-01 01:01:00 | 0.87 |
| 3 | 3 | 52.0 | 1 | Housework | 1970-01-01 01:01:00 | 0.77 |
| 4 | 4 | 26.0 | 1 | Child Care | 1970-01-01 01:01:00 | 0.38 |
not_tud['perc'] = round(not_tud['count']*100/not_tud.loc[not_tud['timeStamp']==1, 'count'].sum(), 2)
not_tud.head()
| index | count | timeStamp | activity | time | perc | |
|---|---|---|---|---|---|---|
| 0 | 10 | 70148.0 | 1 | Leisure | 1970-01-01 01:01:00 | 95.11 |
| 1 | 6 | 1973.0 | 1 | Work and Education | 1970-01-01 01:01:00 | 2.67 |
| 2 | 3 | 584.0 | 1 | Housework | 1970-01-01 01:01:00 | 0.79 |
| 3 | 11 | 525.0 | 1 | Travel and Other | 1970-01-01 01:01:00 | 0.71 |
| 4 | 4 | 479.0 | 1 | Child Care | 1970-01-01 01:01:00 | 0.65 |
tud['count'].sum()
9731520.0
not_tud['count'].sum()
106211520.0
tud['timeStamp'].describe()
count 8640.000000 mean 720.500000 std 415.716152 min 1.000000 25% 360.750000 50% 720.500000 75% 1080.250000 max 1440.000000 Name: timeStamp, dtype: float64
tud.to_csv('tud.csv')
not_tud.to_csv('not_tud.csv')
import altair as alt
alt.data_transformers.disable_max_rows()

# Normalized stacked-area chart: each activity's share of observations,
# minute by minute over the day.
area = alt.Chart(tud).mark_area().encode(
    x="timeStamp:T",
    y=alt.Y('count:Q', stack="normalize", axis=alt.Axis(format='%')),
    color="activity:N",
)

# Selection that snaps to the nearest timeStamp under the mouse cursor.
hover = alt.selection(type='single', nearest=True, on='mouseover',
                      fields=['timeStamp'], empty='none')

# Fully transparent points across the x axis — they exist only to capture
# the cursor's x position for the selection above.
cursor = alt.Chart(tud).mark_point().encode(
    x='timeStamp:T',
    opacity=alt.value(0),
).add_selection(hover)

# Markers on the areas, visible only at the hovered minute.
markers = area.mark_point().encode(
    opacity=alt.condition(hover, alt.value(1), alt.value(0))
)

# Value labels next to the hovered markers.
labels = area.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(hover, 'count:Q', alt.value(' '))
)

# Vertical gray rule marking the hovered minute.
rule = alt.Chart(tud).mark_rule(color='gray').encode(
    x='timeStamp:T',
).transform_filter(hover)

# Bind the five layers into a single interactive chart.
alt.layer(area, cursor, markers, rule, labels).properties(width=600, height=300)
alt.data_transformers.disable_max_rows()

# Same interactive stacked-area chart as above, but plotting the precomputed
# 'perc' column and using larger hover labels.
area = alt.Chart(tud).mark_area().encode(
    x="timeStamp:T",
    y=alt.Y('perc:Q', stack="normalize", axis=alt.Axis(format='%')),
    color="activity:N",
)

# Selection that snaps to the nearest timeStamp under the mouse cursor.
hover = alt.selection(type='single', nearest=True, on='mouseover',
                      fields=['timeStamp'], empty='none')

# Transparent points across the x axis that capture the cursor position.
cursor = alt.Chart(tud).mark_point().encode(
    x='timeStamp:T',
    opacity=alt.value(0),
).add_selection(hover)

# Markers on the areas, visible only at the hovered minute.
markers = area.mark_point().encode(
    opacity=alt.condition(hover, alt.value(1), alt.value(0))
)

# Percentage labels (font size 22) next to the hovered markers.
labels = area.mark_text(align='left', dx=5, dy=-5, fontSize=22).encode(
    text=alt.condition(hover, "perc:Q", alt.value(' '))
)

# Vertical gray rule marking the hovered minute.
rule = alt.Chart(tud).mark_rule(color='gray').encode(
    x='timeStamp:T',
).transform_filter(hover)

# Bind the five layers into a single interactive chart.
alt.layer(area, cursor, markers, rule, labels).properties(width=600, height=300)
alt.data_transformers.disable_max_rows()
# Percent of observations in "Work and Education" across the day.
# BUG FIX: the chart title said "Adult Care" while the data is filtered to
# 'Work and Education' — the title now matches the plotted subset.
alt.Chart(tud[tud['activity'] == 'Work and Education']).mark_area().encode(
    x=alt.X("time:T", axis=alt.Axis(format='%H:%M %p')),
    y=alt.Y('perc:Q', title="Percent of Observations")
).properties(
    title="Work and Education"
)
# Single normalized horizontal bar: overall share of summed counts per activity.
share_bar = alt.Chart(tud).mark_bar()
share_bar.encode(
    color="activity:N",
    x=alt.X('sum(count)', stack="normalize", axis=alt.Axis(format='%')),
)
# Horizontal bars: each activity's share of total recorded time.
# Two join-aggregates compute the grand total and the per-activity total;
# a calculate transform takes their ratio as the activity's percentage.
pct_chart = (
    alt.Chart(tud)
    .transform_joinaggregate(TotalTime='sum(count)')
    .transform_joinaggregate(TotalTimeByAct='sum(count)', groupby=['activity'])
    .transform_calculate(PercentOfTotal="datum.TotalTimeByAct / datum.TotalTime")
    .mark_bar()
)
pct_chart.encode(
    alt.X('PercentOfTotal:Q', axis=alt.Axis(format='%')),
    y=alt.Y('activity:N', sort='x'),
    color='activity:N',
    tooltip=alt.Tooltip('PercentOfTotal:Q', title="Percentage", format='.0%'),
)
# Full outer join of the comparison aggregate (not_tud, left -> _x suffix)
# with the TUD aggregate (tud, right -> _y suffix) on the shared key columns.
join_keys = ['timeStamp', 'activity', 'index', 'time']
htotal_df = not_tud.merge(tud, on=join_keys, how='outer')
htotal_df.head()
| index | count_x | timeStamp | activity | time | perc_x | count_y | perc_y | |
|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 70148.0 | 1 | Leisure | 1970-01-01 01:01:00 | 95.11 | 6449.0 | 95.43 |
| 1 | 6 | 1973.0 | 1 | Work and Education | 1970-01-01 01:01:00 | 2.67 | 164.0 | 2.43 |
| 2 | 3 | 584.0 | 1 | Housework | 1970-01-01 01:01:00 | 0.79 | 52.0 | 0.77 |
| 3 | 11 | 525.0 | 1 | Travel and Other | 1970-01-01 01:01:00 | 0.71 | 59.0 | 0.87 |
| 4 | 4 | 479.0 | 1 | Child Care | 1970-01-01 01:01:00 | 0.65 | 26.0 | 0.38 |
htotal_df.to_csv('htotal_df.csv')

# Blend the two samples 20/80 (not_tud weight 0.2, tud weight 0.8) into
# combined counts and percentages.
htotal_df['count'] = round(htotal_df['count_x']*0.2 + htotal_df['count_y']*0.8, 2)
# BUG FIX: the original first derived 'perc' from the blended counts and then
# immediately overwrote it with the weighted blend of perc_x/perc_y. The dead
# first assignment is removed; only the blend that actually survives is kept.
htotal_df['perc'] = round(htotal_df['perc_x']*0.2 + htotal_df['perc_y']*0.8, 2)
htotal_df.head(20)
| index | count_x | timeStamp | activity | time | perc_x | count_y | perc_y | count | perc | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10 | 70148.0 | 1 | Leisure | 1970-01-01 01:01:00 | 95.11 | 6449.0 | 95.43 | 19188.8 | 95.37 |
| 1 | 6 | 1973.0 | 1 | Work and Education | 1970-01-01 01:01:00 | 2.67 | 164.0 | 2.43 | 525.8 | 2.48 |
| 2 | 3 | 584.0 | 1 | Housework | 1970-01-01 01:01:00 | 0.79 | 52.0 | 0.77 | 158.4 | 0.77 |
| 3 | 11 | 525.0 | 1 | Travel and Other | 1970-01-01 01:01:00 | 0.71 | 59.0 | 0.87 | 152.2 | 0.84 |
| 4 | 4 | 479.0 | 1 | Child Care | 1970-01-01 01:01:00 | 0.65 | 26.0 | 0.38 | 116.6 | 0.43 |
| 5 | 5 | 49.0 | 1 | Adult Care | 1970-01-01 01:01:00 | 0.07 | 8.0 | 0.12 | 16.2 | 0.11 |
| 6 | 10 | 70128.0 | 2 | Leisure | 1970-01-01 01:02:00 | 95.08 | 6448.0 | 95.41 | 19184.0 | 95.34 |
| 7 | 6 | 1977.0 | 2 | Work and Education | 1970-01-01 01:02:00 | 2.68 | 165.0 | 2.44 | 527.4 | 2.49 |
| 8 | 3 | 590.0 | 2 | Housework | 1970-01-01 01:02:00 | 0.80 | 53.0 | 0.78 | 160.4 | 0.78 |
| 9 | 11 | 535.0 | 2 | Travel and Other | 1970-01-01 01:02:00 | 0.73 | 58.0 | 0.86 | 153.4 | 0.83 |
| 10 | 4 | 482.0 | 2 | Child Care | 1970-01-01 01:02:00 | 0.65 | 26.0 | 0.38 | 117.2 | 0.43 |
| 11 | 5 | 46.0 | 2 | Adult Care | 1970-01-01 01:02:00 | 0.06 | 8.0 | 0.12 | 15.6 | 0.11 |
| 12 | 10 | 70134.0 | 3 | Leisure | 1970-01-01 01:03:00 | 95.09 | 6449.0 | 95.43 | 19186.0 | 95.36 |
| 13 | 6 | 1981.0 | 3 | Work and Education | 1970-01-01 01:03:00 | 2.69 | 165.0 | 2.44 | 528.2 | 2.49 |
| 14 | 3 | 580.0 | 3 | Housework | 1970-01-01 01:03:00 | 0.79 | 52.0 | 0.77 | 157.6 | 0.77 |
| 15 | 11 | 535.0 | 3 | Travel and Other | 1970-01-01 01:03:00 | 0.73 | 58.0 | 0.86 | 153.4 | 0.83 |
| 16 | 4 | 483.0 | 3 | Child Care | 1970-01-01 01:03:00 | 0.65 | 26.0 | 0.38 | 117.4 | 0.43 |
| 17 | 5 | 45.0 | 3 | Adult Care | 1970-01-01 01:03:00 | 0.06 | 8.0 | 0.12 | 15.4 | 0.11 |
| 18 | 10 | 70150.0 | 4 | Leisure | 1970-01-01 01:04:00 | 95.11 | 6449.0 | 95.43 | 19189.2 | 95.37 |
| 19 | 6 | 1980.0 | 4 | Work and Education | 1970-01-01 01:04:00 | 2.68 | 166.0 | 2.46 | 528.8 | 2.50 |
# Same percent-of-total bar chart as before, computed on the blended
# htotal_df: grand total and per-activity total via join-aggregates, with
# their ratio shown as each activity's percentage of all time.
blended_pct = (
    alt.Chart(htotal_df)
    .transform_joinaggregate(TotalTime='sum(count)')
    .transform_joinaggregate(TotalTimeByAct='sum(count)', groupby=['activity'])
    .transform_calculate(PercentOfTotal="datum.TotalTimeByAct / datum.TotalTime")
    .mark_bar()
)
blended_pct.encode(
    alt.X('PercentOfTotal:Q', axis=alt.Axis(format='%')),
    y=alt.Y('activity:N', sort='x'),
    color='activity:N',
    tooltip=alt.Tooltip('PercentOfTotal:Q', title="Percentage", format='.0%'),
)